library(dplyr)
library(ggplot2)
library(readr)
library(cowplot)
library(NanoStringQCPro)
infn = "../data/metadata-hepb.csv"
metadata = read_csv(infn)
## Warning: Missing column names filled in: 'X13' [13]
## Parsed with column specification:
## cols(
## rowname = col_integer(),
## `Sample Name` = col_character(),
## `RCC File Name` = col_character(),
## `Clinical Group` = col_integer(),
## Month = col_character(),
## `Plate+Sample` = col_character(),
## `Spike In` = col_character(),
## `Date Extracted` = col_character(),
## `Who Extracted` = col_character(),
## Comments = col_character(),
## `Date Shipped` = col_character(),
## `RCC File Date` = col_character(),
## X13 = col_character()
## )
The metadata file needs some cleaning up so we’ll do that here. First we’ll remove spaces and drop columns with no meaning.
cnames = c("rownumber", "sample", "filename", "diseaseclass", "month", "plate_sample",
"spikein", "extract_date", "extractor", "comments", "ship_date",
"file_date", "empty")
colnames(metadata) = cnames
metadata = metadata %>% select(-empty, -rownumber)
metadata$diseaseclass = as.factor(metadata$diseaseclass)
Samplenames and metadata look unique:
nrow(metadata) == length(unique(metadata$sample))
## [1] TRUE
nrow(metadata) == length(unique(metadata$filename))
## [1] TRUE
class = as.factor(c(1,2,3,4,5,6,7))
phenotype = c("acute", "positive_tolerant", "positive_active",
"negative_inactive", "negative_chronic",
"negative_unknown", "spontaneous")
active = c("active", "inactive", "active", "inactive", "active",
"unknown", "inactive")
classdata = data.frame(diseaseclass=class, phenotype=phenotype, active=active)
metadata = metadata %>% left_join(classdata, by="diseaseclass")
metadata$es = grepl("ES", metadata$extractor)
metadata$cm = grepl("CM", metadata$extractor)
metadata$ed = grepl("ED", metadata$extractor)
metadata$zymo = grepl("zymo", metadata$extractor)
metadata$active = ifelse(metadata$diseaseclass %in% c(1,3,5), "active",
"inactive")
metadata$active = ifelse(metadata$diseaseclass %in% c(1,3,5), "active",
"inactive")
rccFiles = paste("../data/rcc/", metadata$filename, sep="")
eset = newRccSet(rccFiles=rccFiles)
## Reading RCC files...
## checkRccSet() messages:
## Optional featureData cols not found: BarCode, ProbeID, TargetSeq, Species, Comments
## The following panel housekeeping genes were found: B2M|0, GAPDH|0, RPL19|0, ACTB|0, RPLP0|0
## Warning in .local(rccSet, ...):
## Non-standard CodeClass values found in featureData (values currently recognized are "Endogenous", "Housekeeping", "Positive", and "Negative"): Ligation, Endogenous1, SpikeIn)
colnames(eset) = metadata$sample
A small set of miRNA have known inflated values; Nanostring has a heuristic to correct for the inflation via the formula using some supplied constants. Here we implement that fix to correct the expression of those miRNA.
plot_corrected_miRNA = function(scaled, eset) {
require(reshape)
require(ggplot2)
counts = exprs(eset)
rownames(counts) = rownames(scaled)
is_scaled = rowSums(abs(counts - scaled)) > 0
counts = melt(as.matrix(counts[is_scaled,]))
colnames(counts) = c("gene", "sample", "value")
counts$scaled = "no"
scaled = melt(as.matrix(scaled[is_scaled,]))
colnames(scaled) = c("gene", "sample", "value")
scaled$scaled = "yes"
all = rbind(counts, scaled)
library(cowplot)
ggplot(all, aes(sample, value, color=scaled)) +
facet_wrap(~gene, scale="free_y") +
scale_y_sqrt() +
geom_point(size=0.5) +
theme(axis.title.x=element_blank(),
axis.text.x=element_blank(),
axis.ticks.x=element_blank(),
strip.text.x=element_text(size=8)) +
guides(colour = guide_legend(override.aes = list(size=6))) + ylab("") +
ggtitle("miRNA hybridization crosstalk correction")
}
correct_miRNA = function(eset, plot=TRUE) {
# scales specific features tagged known to have crosstalk by
# the Nanostring provided scaling factor
require(dplyr)
fdat = fData(eset)
fdat = fdat %>%
tidyr::separate(GeneName, c("Name", "scalefactor"), sep='\\|',
fill="right", remove=FALSE) %>%
dplyr::mutate(scalefactor=ifelse(is.na(scalefactor), 0, scalefactor))
sf = as.numeric(fdat$scalefactor)
posa = as.numeric(exprs(eset)["Positive_POS_A_ERCC_00117.1",])
scales = t(replicate(length(sf), posa))
scales = scales * sf
scaled = exprs(eset) - scales
scaled[scaled<0] = 0
print(plot_corrected_miRNA(scaled, eset))
exprs(eset) = round(scaled)
return(eset)
}
eset = correct_miRNA(eset)
This percentage should be super high, close to 100%. If there isn’t that usually means the cartridge needs to be rescanned. If it is < 3 weeks from the scan, rescanning should be okay. Nanostring folks call < 75% a failure. These all look fine.
plotFOV = function(eset, metadata) {
pdat = pData(eset) %>%
tibble::rownames_to_column() %>%
left_join(metadata, by=c("rowname"="sample"))
pdat$pcounted = pdat$FovCounted/pdat$FovCount * 100
ggplot(pdat, aes(rowname, pcounted)) + geom_point() +
theme(axis.text.x=element_blank(),
axis.ticks.x=element_blank(),
strip.text.x=element_text(size=8)) +
scale_y_continuous(expand = c(0,0)) +
expand_limits(y = c(0,1.05 * max(pdat$pcounted))) +
ylab("percentage of FOV counted") + xlab("sample") +
geom_hline(yintercept=75, color="red")
}
plotFOV(eset, metadata)
Binding density looks ok.
plotBD = function(eset, metadata) {
pdat = pData(eset) %>%
tibble::rownames_to_column() %>%
left_join(metadata, by=c("rowname"="sample"))
pdat$pcounted = pdat$FovCounted/pdat$FovCount * 100
ggplot(pdat, aes(rowname, BindingDensity)) + geom_point() +
theme(axis.text.x=element_blank(),
axis.ticks.x=element_blank(),
strip.text.x=element_text(size=8)) +
scale_y_continuous(expand = c(0,0)) +
expand_limits(y = c(0,1.05 * max(pdat$BindingDensity))) +
ylab("Binding density") + xlab("sample") +
geom_hline(yintercept=0.05, color="red") +
geom_hline(yintercept=2.25, color="red")
}
plotBD(eset, metadata)
Here I plotted the total counts vs the number of miRNA detected, where detected is it has counts > 30. There is a pretty big spread among the samples in terms of the miRNA detected. The Zymo columns seem to all have a low number of miRNA detected overall and for a given total mber of counts, a smaller number of miRNA detected. This indicates the samples using the Zymo columns are less complex than the non-Zymo samples.
plotComplexity = function(eset, metadata) {
counts = exprs(eset)
endocounts = counts[grepl("Endo", rownames(counts)),]
cdf = data.frame(total=colSums(counts), detected=colSums(counts > 10))
rownames(cdf) = colnames(counts)
cdf$sample = rownames(cdf)
cdf = cdf %>% left_join(metadata, by="sample")
ggplot(cdf, aes(total, detected, color=zymo, shape=es)) + geom_point()
}
plotComplexity(eset, metadata)
library(ggplot2)
library(dplyr)
library(cowplot)
is_positive = function(column) {
return(grepl("Pos", column))
}
is_negative = function(column) {
return(grepl("Neg", column))
}
is_spikein = function(column) {
return(grepl("Spike", column))
}
is_ligation = function(column) {
return(grepl("Ligati", column))
}
is_housekeeping = function(column) {
return(grepl("Housekee", column))
}
is_prior = function(column) {
return(grepl("miR-159", column) | grepl("miR-248", column) |
grepl("miR-254", column))
}
extract_pred = function(eset, predicate, counts=FALSE) {
if(!counts) {
counts = data.frame(exprs(eset))
} else {
counts = eset
}
toplot = counts[predicate(rownames(counts)),] %>%
tibble::rownames_to_column() %>%
tidyr::gather("sample", "count", -rowname)
colnames(toplot) = c("spot", "sample", "count")
toplot = toplot %>% left_join(metadata, by="sample")
return(toplot)
}
spotbarplot = function(toplot) {
ggplot(toplot,
aes(sample, count)) + geom_bar(stat='identity') +
facet_wrap(~spot) +
theme(axis.text.x = element_blank(),
text = element_text(size=8))
}
spotboxplot = function(toplot) {
ggplot(toplot,
aes(linehypo, count)) + geom_boxplot() +
facet_wrap(~spot) +
theme(axis.text.x = element_text(angle = 90, hjust = 1))
}
Below we look at the R^2 correlation between the expected positive control concentrations and the observed concentrations for each sample.
A set of samples have a lower correlation than the other samples.
spotbarplot(extract_pred(eset, is_positive))
posR2 = function(eset) {
pcdf = data.frame(concentration=log2(c(128, 32, 8, 2, 0.5, 0.125)),
GeneName=paste("POS", c("A", "B", "C", "D", "E", "F"), sep="_"))
pccounts = subset(exprs(eset), grepl("Positive_POS", rownames(exprs(eset))))
pccounts = pccounts[sort(rownames(pccounts)),]
rownames(pccounts) = pcdf$GeneName
corsamples = data.frame(correlation=apply(pccounts, 2,
function(x) cor(x, pcdf$concentration)),
sample=colnames(pccounts)) %>%
left_join(metadata, by="sample")
ggplot(corsamples, aes(sample, correlation)) + geom_point() +
theme(axis.text.x=element_blank(),
axis.ticks.x=element_blank(),
strip.text.x=element_text(size=8)) +
scale_y_continuous(expand = c(0,0)) +
expand_limits(y = c(0,1.05 * max(corsamples$correlation))) +
ylab("positive control correlation") +
xlab("sample")
return(corsamples)
}
corsamples = posR2(eset)
## Warning in left_join_impl(x, y, by$x, by$y, suffix$x, suffix$y): joining
## character vector and factor, coercing into character vector
We can see some samples have a higher negative control count than the other samples.
spotbarplot(extract_pred(eset, is_negative))
knitr::kable(subset(extract_pred(eset, is_negative), count > 50))
| spot | sample | count | filename | diseaseclass | month | plate_sample | spikein | extract_date | extractor | comments | ship_date | file_date | phenotype | active | es | cm | ed | zymo |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| ## Spik | eIn | |||||||||||||||||
| We have | two diff | erent se | ts of spike | ins (OLD/NEW)? | But the | y don’t look mu | ch | |||||||||||
| differe | nt from e | ach othe | r: |
spotbarplot(extract_pred(eset, is_spikein))
spikebarplot = function(toplot) {
ggplot(toplot,
aes(sample, count, fill=spikein)) + geom_bar(stat='identity') +
facet_wrap(~spot) +
theme(axis.text.x = element_blank(),
text = element_text(size=8))
}
There is quite a bit of variability in the ligation controls for the samples:
ligboxplot = function(toplot) {
ggplot(toplot,
aes(phenotype, count)) + geom_boxplot() +
theme(axis.text.x = element_text(angle = 90, hjust = 1)) +
facet_wrap(~spot)
}
ligboxplot(extract_pred(eset, is_ligation))
There are some samples with a pretty bad R2 for the ligation control. We will drop these.
ligR2 = function(eset) {
pcdf = data.frame(concentration=log2(c(128, 32, 8)),
GeneName=paste("POS_", c("A", "B", "C"), sep="_"))
pccounts = subset(exprs(eset), grepl("Ligation_LIG_POS", rownames(exprs(eset))))
pccounts = pccounts[sort(rownames(pccounts)),]
rownames(pccounts) = pcdf$GeneName
corsamples = data.frame(correlation=apply(pccounts, 2,
function(x) cor(x, pcdf$concentration)),
sample=colnames(pccounts)) %>%
left_join(metadata, by="sample")
print(ggplot(corsamples, aes(sample, correlation)) + geom_point() +
theme(axis.text.x=element_blank(),
axis.ticks.x=element_blank(),
strip.text.x=element_text(size=8)) +
scale_y_continuous(expand = c(0,0)) +
expand_limits(y = c(0,1.05 * max(corsamples$correlation))) +
ylab("ligation control correlation") +
xlab("sample"))
return(corsamples)
}
ligcor = ligR2(eset)
## Warning in cor(x, pcdf$concentration): the standard deviation is zero
## Warning in left_join_impl(x, y, by$x, by$y, suffix$x, suffix$y): joining
## character vector and factor, coercing into character vector
## Warning: Removed 1 rows containing missing values (geom_point).
dropsamples = subset(ligcor, correlation < 0.9)$sample
Here we calculated ligation efficiency for the samples. This is the log of LIG_POS_A(i) / mean(LIG_POS_A) for each sample i. It doesn’t matter which direction this is in, it will get corrected for in the model. Again we can see most of the Zymo columns are outliers in this.
A small number of samples seem like the ligation step failed, so we can drop those from the analysis.
lignorm = function(eset, feature="Ligation_LIG_POS_A") {
counts = exprs(eset)
ligcounts = as.numeric(counts[grepl(feature, rownames(counts)),])
lnorm = log2((ligcounts + 1) / mean(ligcounts))
names(lnorm) = colnames(counts)
return(lnorm)
}
metadata$lignorm = lignorm(eset)
metadata$ligscale = 2^(-metadata$lignorm)
ggplot(metadata, aes(sample, lignorm)) + geom_point() +
theme(axis.text.x=element_blank(),
axis.ticks.x=element_blank(),
strip.text.x=element_text(size=8)) +
ylab("ligation factor") +
xlab("sample")
dropsamples = unique(c(dropsamples, subset(metadata, lignorm < -2.5)$sample))
Some housekeeping genes are expressed more highly in some samples. Not sure what we are supposed to do with this information, if there are any suggestions we could easily implement it.
spotbarplot(extract_pred(eset, is_housekeeping))
We’ll normalize the libraries and look at the negative control expression and then come up with a cutoff that is above the noise floor for the libraries. It looks like 30 is a reasonable noise floor to use.
drop_unusable = function(counts) {
drop = is_spikein(rownames(counts))
drop = drop | is_positive(rownames(counts))
drop = drop | is_housekeeping(rownames(counts))
drop = drop | is_ligation(rownames(counts))
keep = counts[!drop,]
keep = keep[, !grepl("Blank", colnames(keep))]
return(keep)
}
library(DESeq2)
dds = DESeqDataSetFromMatrix(drop_unusable(exprs(eset)), colData=metadata,
design=~sample)
## converting counts to integer mode
## Warning in DESeqDataSet(se, design = design, ignoreRank): some variables in
## design formula are characters, converting to factors
dds = estimateSizeFactors(dds)
ncounts = data.frame(counts(dds, normalized=TRUE))
negcounts = extract_pred(ncounts, is_negative, counts=TRUE)
ggplot(extract_pred(ncounts, is_negative, counts=TRUE), aes(count)) + geom_histogram()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
nfloor = 30
filter_miRNA = function(counts) {
drop = is_spikein(rownames(counts))
drop = drop | is_positive(rownames(counts))
drop = drop | is_negative(rownames(counts))
drop = drop | is_housekeeping(rownames(counts))
drop = drop | is_ligation(rownames(counts))
drop = drop | (rowSums(counts > nfloor) < (0.2 * ncol(counts)))
keep = counts[!drop,]
keep = keep[, !grepl("Blank", colnames(keep))]
return(keep)}
counts = exprs(eset)
counts = filter_miRNA(counts)
counts = counts[, ! colnames(counts) %in% dropsamples]
metadata = subset(metadata, sample %in% colnames(counts))
One of the things that came up was whether or not it is okay to normalize the counts by total library size. The idea was that in some experimental conditions, it might be that there are overall more miRNA expression than other conditions, and if we normalize by library size we will lose that information.
If that is true, then we should expect to see different library sizes for different phenotypes. We test that here by calculating the library size for each phenotype and fitting a model of the library size dependent on phenotype. We see that the positive_tolerant samples have a higher library size, which might be indicative of more miRNA expression. So we can’t just normalize by total library size, or else we will lose that information.
cdf = data.frame(phenotype=metadata$phenotype, libsize=colSums(counts))
cdf$phenotype = relevel(cdf$phenotype, ref="negative_inactive")
fit = lm(libsize~phenotype, cdf)
summary(fit)
##
## Call:
## lm(formula = libsize ~ phenotype, data = cdf)
##
## Residuals:
## Min 1Q Median 3Q Max
## -99116 -38186 -20034 27008 213892
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 44003 12226 3.599 0.000462 ***
## phenotypeacute -9193 17764 -0.517 0.605754
## phenotypenegative_chronic 2438 16893 0.144 0.885466
## phenotypenegative_unknown -13042 18339 -0.711 0.478341
## phenotypepositive_active 8656 17290 0.501 0.617538
## phenotypepositive_tolerant 57861 19965 2.898 0.004452 **
## phenotypespontaneous 6832 17083 0.400 0.689925
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 54680 on 122 degrees of freedom
## Multiple R-squared: 0.1062, Adjusted R-squared: 0.06223
## F-statistic: 2.416 on 6 and 122 DF, p-value: 0.03057
ggplot(cdf, aes(libsize)) + geom_histogram() + facet_wrap(~phenotype)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
But there are clearly variations within a class of the library size, so we have to do something to normalize it.
Here we fit two models. The first model we fit the full model, including the ligation normalization term in the model. Then fit a reduced model without the lignorm term to answer the question ‘are there miRNA that are affected by the ligation efficiency?’ We already scaled the counts via the ligation normalization scale, so here we are asking are specific genes still affected.
full = ~spikein+lignorm+phenotype
reduced = ~spikein+phenotype
ncounts = round(counts * metadata$ligscale)
dds = DESeqDataSetFromMatrix(ncounts, colData=metadata,
design=full)
## converting counts to integer mode
## Warning in DESeqDataSet(se, design = design, ignoreRank): some variables in
## design formula are characters, converting to factors
dds = DESeq(dds, full=full, reduced=reduced, test="LRT")
## estimating size factors
## estimating dispersions
## gene-wise dispersion estimates
## mean-dispersion relationship
## -- note: fitType='parametric', but the dispersion trend was not well captured by the
## function: y = a/x + b, and a local regression fit was automatically substituted.
## specify fitType='local' or 'mean' to avoid this message next time.
## final dispersion estimates
## fitting model and testing
res = results(dds)
plotMA(res)
res = data.frame(res) %>% tibble::rownames_to_column() %>%
arrange(padj) %>% filter(padj < 0.05)
knitr::kable(subset(res, padj < 0.05))
| rowname | baseMean | log2FoldChange | lfcSE | stat | pvalue | padj |
|---|---|---|---|---|---|---|
| Endogenous1_hsa-miR-92a-3p|0_MIMAT0000092 | 70.34172 | -0.3404734 | 0.2627347 | 31.987121 | 0.0000000 | 0.0000017 |
| Endogenous1_hsa-miR-16-5p|0_MIMAT0000069 | 709.41678 | 0.5103207 | 0.3120318 | 27.752372 | 0.0000001 | 0.0000076 |
| Endogenous1_hsa-miR-144-3p|0_MIMAT0000436 | 75.86848 | -0.3216294 | 0.4332957 | 18.455379 | 0.0000174 | 0.0005605 |
| Endogenous1_hsa-miR-181a-5p|0_MIMAT0000256 | 132.73355 | 0.3623254 | 0.3765303 | 18.153404 | 0.0000204 | 0.0005605 |
| Endogenous1_hsa-miR-320e|0_MIMAT0015072 | 29.85206 | -0.3645865 | 0.3604161 | 17.421117 | 0.0000299 | 0.0006589 |
| Endogenous1_hsa-miR-23a-3p|0_MIMAT0000078 | 463.84048 | 0.1198514 | 0.3616441 | 17.023988 | 0.0000369 | 0.0006767 |
| Endogenous1_hsa-miR-520f-3p|0_MIMAT0002830 | 56.09597 | -0.1196358 | 0.3942447 | 16.548837 | 0.0000474 | 0.0007451 |
| Endogenous1_hsa-miR-28-5p|0_MIMAT0000085 | 55.87426 | -0.1561955 | 0.2447438 | 13.920096 | 0.0001907 | 0.0023314 |
| Endogenous1_hsa-miR-98-5p|0_MIMAT0000096 | 76.38813 | 0.3065609 | 0.3062086 | 14.015916 | 0.0001813 | 0.0023314 |
| Endogenous1_hsa-miR-28-3p|0_MIMAT0004502 | 20.92937 | -0.0885269 | 0.2835116 | 12.701442 | 0.0003654 | 0.0040191 |
| Endogenous1_hsa-miR-374a-5p|0_MIMAT0000727 | 162.56970 | 0.5843541 | 0.3283268 | 10.567013 | 0.0011512 | 0.0105530 |
| Endogenous1_hsa-miR-148b-3p|0_MIMAT0000759 | 37.09184 | -0.2132900 | 0.2782874 | 10.632835 | 0.0011110 | 0.0105530 |
| Endogenous1_hsa-miR-425-5p|0_MIMAT0003393 | 27.28747 | 0.1493647 | 0.2225925 | 10.191958 | 0.0014105 | 0.0119354 |
| Endogenous1_hsa-miR-194-5p|0_MIMAT0000460 | 33.88295 | 0.2167322 | 0.4289128 | 9.483772 | 0.0020730 | 0.0162876 |
| Endogenous1_hsa-miR-612|0_MIMAT0003280 | 68.02116 | -0.2486859 | 0.5189142 | 9.336776 | 0.0022460 | 0.0164707 |
| Endogenous1_hsa-miR-340-5p|0_MIMAT0004692 | 46.51750 | -0.0538673 | 0.2500769 | 9.056817 | 0.0026172 | 0.0179931 |
| Endogenous1_hsa-miR-19b-3p|0_MIMAT0000074 | 77.25965 | 0.0642163 | 0.2733084 | 8.799971 | 0.0030124 | 0.0194917 |
| Endogenous1_hsa-miR-223-3p|0_MIMAT0000280 | 2491.47725 | 0.3267386 | 0.4660603 | 8.675970 | 0.0032243 | 0.0197042 |
| Endogenous1_hsa-miR-125b-5p|0_MIMAT0000423 | 69.14712 | 0.0472137 | 0.3927637 | 8.505802 | 0.0035402 | 0.0204957 |
| Endogenous1_hsa-miR-199a-3p+hsa-miR-199b-3p|0_MIMAT0000232 | 408.66601 | 0.1798501 | 0.4091481 | 7.635129 | 0.0057242 | 0.0314833 |
| Endogenous1_hsa-miR-185-5p|0_MIMAT0000455 | 44.03114 | -0.1532540 | 0.2367752 | 6.973042 | 0.0082747 | 0.0433435 |
| Endogenous1_hsa-miR-151a-3p|0_MIMAT0000757 | 51.64657 | 0.0638871 | 0.3115583 | 6.793692 | 0.0091481 | 0.0457403 |
It looks like maybe there is some separation along the second principal component for the samples labelled as positive vs the samples labelled as negative:
dds = DESeqDataSetFromMatrix(counts, colData=metadata, design=full)
## converting counts to integer mode
## Warning in DESeqDataSet(se, design = design, ignoreRank): some variables in
## design formula are characters, converting to factors
vst = varianceStabilizingTransformation(dds)
## -- note: fitType='parametric', but the dispersion trend was not well captured by the
## function: y = a/x + b, and a local regression fit was automatically substituted.
## specify fitType='local' or 'mean' to avoid this message next time.
pca_loadings = function(object, ntop=500) {
rv <- matrixStats::rowVars(assay(object))
select <- order(rv, decreasing = TRUE)[seq_len(min(ntop,
length(rv)))]
pca <- prcomp(t(assay(object)[select, ]))
percentVar <- pca$sdev^2/sum(pca$sdev^2)
names(percentVar) = colnames(pca$x)
pca$percentVar = percentVar
return(pca)}
pc = pca_loadings(vst)
comps = data.frame(pc$x)
comps$Name = rownames(comps)
library(dplyr)
comps = comps %>% left_join(metadata, by=c("Name"="sample"))
pca_plot = function(comps, nc1, nc2, colorby) {
c1str = paste0("PC", nc1)
c2str = paste0("PC", nc2)
ggplot(comps, aes_string(c1str, c2str, color=colorby)) +
geom_point() + theme_bw() +
xlab(paste0(c1str, ": ", round(pc$percentVar[nc1] * 100), "% variance")) +
ylab(paste0(c2str, ": ", round(pc$percentVar[nc2] * 100), "% variance"))
}
pca_plot(comps, 1, 2, "phenotype")
Here we ignore the miRNA-specific ligation normalization and just do what the NanoString folks suggested, scale the counts to remove the ligation effect for each sample.
write_results = function(res, fname) {
res = data.frame(res) %>% tibble::rownames_to_column() %>%
arrange(pvalue)
write.table(res, file=fname, quote=FALSE, col.names=TRUE,
row.names=FALSE, sep=",")
}
full = ~spikein+diseaseclass
ncounts = round(counts * metadata$ligscale)
dds = DESeqDataSetFromMatrix(ncounts, colData=metadata, design=full)
## converting counts to integer mode
## Warning in DESeqDataSet(se, design = design, ignoreRank): some variables in
## design formula are characters, converting to factors
dds = DESeq(dds, fitType='local')
## estimating size factors
## estimating dispersions
## gene-wise dispersion estimates
## mean-dispersion relationship
## final dispersion estimates
## fitting model and testing
plotDispEsts(dds)
res = results(dds, contrast=list(c("diseaseclass1", "diseaseclass3", "diseaseclass5"),
c("diseaseclass2", "diseaseclass4", "diseaseclass7")))
plotMA(res)
write_results(res, "active-vs-inactive.csv")
res = results(dds, contrast=list(c("diseaseclass3", "diseaseclass5"),
c("diseaseclass2", "diseaseclass4")))
plotMA(res)
write_results(res, "active-vs-inactive-chronoic.csv")
res = results(dds, contrast=list(c("diseaseclass1"), c("diseaseclass7")))
plotMA(res)
write_results(res, "active-vs-inactive-acute.csv")
res = results(dds, contrast=list(c("diseaseclass3"), c("diseaseclass2")))
plotMA(res)
write_results(res, "group3-vs-group2.csv")
res = results(dds, contrast=list(c("diseaseclass5"), c("diseaseclass4")))
plotMA(res)
write_results(res, "group5-vs-group4.csv")
res = results(dds, contrast=list(c("diseaseclass3"), c("diseaseclass4")))
plotMA(res)
write_results(res, "group3-vs-group4.csv")
res = results(dds, contrast=list(c("diseaseclass3"), c("diseaseclass7")))
plotMA(res)
write_results(res, "group3-vs-group7.csv")
res = results(dds, contrast=list(c("diseaseclass5"), c("diseaseclass7")))
plotMA(res)
write_results(res, "group5-vs-group7.csv")